Author: Leonardo Espin
Date: 11/17/2018
#importing libraries
import pandas as pd
from datetime import datetime #for date operations
from datetime import timedelta
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import xml.etree.ElementTree as et #for parsing the XML file
#parsing with ElementTree
parsed_xml = et.parse('../../../Desktop/2010-2018.xml')
df=pd.DataFrame(columns=['created','time','sport','duration',
'calories','distance','heart-rate_ave','heart-rate_max',
'weight','note'])
The code below extracts data from the XML file and adds it to a dataframe. It is written according to the tree-structure of the XML file:
<polar-exercise-data>
  <exercise>
    <created>2010-04-18 15:31:20.0</created>
    <time>2010-04-17 19:14:00.0</time>
    <sport>elliptical machine</sport>
    <result>
      <duration>00:32:37</duration>
      <calories>410</calories>
      <user-settings>
        <heart-rate>
          <resting>68</resting>
          <maximum>190</maximum>
        </heart-rate>
        <weight>75.0</weight>
        <height>175.0</height>
      </user-settings>
      <zones>
        <zone index="0">
          <upper>151</upper>
          <lower>114</lower>
          <in-zone>00:00</in-zone>
        </zone>
      </zones>
      <heart-rate>
        <average>149</average>
        <maximum>168</maximum>
      </heart-rate>
      <speed>
        <type>CYCLING</type>
      </speed>
    </result>
  </exercise>
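Nested tags are reached by passing a path-like string to find(). For example, a quick sketch assuming the root's first child is the exercise shown above:
first = parsed_xml.getroot()[0] #first <exercise> node
print(first.find('result/duration').text)           #e.g. '00:32:37'
print(first.find('result/heart-rate/average').text) #e.g. '149'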
def getval(node):
    #return None when the node is missing, to avoid an AttributeError on .text
    return node.text if node is not None else None
count=0
for i,node in enumerate(parsed_xml.getroot()):
    created=node.find('created')
    time=node.find('time')
    sport=node.find('sport')
    if sport is None:
        sport=node.find('name')
        duration=node.find('target/duration')
        calories=node.find('target/calories')
        distance=node.find('target/distance')
        aveHR=node.find('result/heart-rate/average')
        maxHR=node.find('result/heart-rate/maximum')
        weight=node.find('result/user-settings/weight')
        note=node.find('note')
        count +=1
    else:
        duration=node.find('result/duration')
        calories=node.find('result/calories')
        distance=node.find('result/distance')
        aveHR=node.find('result/heart-rate/average')
        maxHR=node.find('result/heart-rate/maximum')
        weight=node.find('result/user-settings/weight')
        note=node.find('note')
    df.loc[i] = [getval(created),getval(time),getval(sport),getval(duration),
                 getval(calories),getval(distance),getval(aveHR),getval(maxHR),
                 getval(weight),getval(note)]
print(df.shape)
print('number of unusual rows: {}'.format(count))
df.head()
Note that the duration column has data in different formats; I will correct that later.
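One quick way to see the mix of formats is to look at the lengths of the duration strings (a small sketch; at this point durations are still plain strings):
#count how many duration strings have each length (e.g. 'MM:SS' vs 'HH:MM:SS')
df['duration'].str.len().value_counts()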
#let's check the end of the dataframe
df.tail()
The last two rows have no information, so I drop them:
df.drop([1622,1623],inplace=True)
df.tail()
What sports are the most common?
df['sport'].value_counts().plot.bar(figsize=(10, 6));
Let's check which other rows are useless by counting missing values:
df.isna().sum()
If the 'sport' value is missing, what else is missing?
df[df['sport'].isna()]
The rows above are useless, so let's drop them:
df.drop(df[df['sport'].isna()].index,inplace=True)
#check that missing data was removed
df['sport'].isna().sum()
#checking the data types
df.dtypes
I need to convert the columns to appropriate types in order to work with the data:
df['created']=pd.to_datetime(df['created'])
df['time']=pd.to_datetime(df['time'])
#df['duration']=pd.to_timedelta(df['duration'])
df['calories']=df['calories'].astype('float64')
df['distance']=df['distance'].astype('float64')
df['heart-rate_ave']=df['heart-rate_ave'].astype('float64')
df['heart-rate_max']=df['heart-rate_max'].astype('float64')
df['weight']=df['weight'].astype('float64')
#checking dtypes again
df.dtypes
As seen above, the weight column is in kilos, but I am used to pounds, so I will convert it:
#check that weight is always in kilos (every non-missing value below 100); result should be 0
sum(df['weight']<100)+df['weight'].isna().sum() -len(df)
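An equivalent, slightly more explicit form of the same check (just a sketch):
#every weight should either be missing or below 100 (i.e. recorded in kilos)
((df['weight'] < 100) | df['weight'].isna()).all()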
df['weight']=df['weight']*2.2046 #kilos to pounds conversion factor
#plotting the weight column
df.set_index('time')['weight'].plot.line(figsize=(10, 6))
plt.axvline(pd.to_datetime('2015-6-8'), color='r', linestyle='--', lw=2)
plt.axvline(pd.to_datetime('2016-6-15'), color='r', linestyle='--', lw=2);
Let's inspect the first ten rows again
df.head(10)
Some distance information is missing from the distance column but appears in the note column of some rows. I will extract it and add it to the distance column.
#change from meters to km first (see output of df.head() above)
df['distance']=df['distance'].apply(lambda x:round(x/1000,3))
#extract the values matching a pattern to a temporary series
#otherwise output zero
tmp=df['note'].apply(lambda x:
                     x.split(' km')[0].replace('Distance ','').replace(',','.')
                     if (isinstance(x, str) and x.find('Distance')>-1) else 0)
#there is still one special case where the string 'Distance' appears but not in the pattern
tmp[340]=0
tmp=tmp.astype('float64')
sum(tmp>0)
#replace missing values with values in temporary series
df.loc[tmp[tmp>0].index,'distance']=tmp[tmp>0]
#check that extraction was successful
df[tmp>0]
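As an aside, the same values could probably be pulled out with a regular expression instead of string splitting; a sketch, assuming notes of the form 'Distance 5,2 km':
#extract the number after 'Distance', normalize the decimal comma;
#rows that do not match the pattern become NaN and are set to 0
tmp_re=(df['note'].str.extract(r'Distance\s*([\d,\.]+)\s*km', expand=False)
                  .str.replace(',', '.', regex=False)
                  .astype('float64')
                  .fillna(0))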
#check head of the dataframe again
df.head(10)
There is no 'speed' column, so it has to be calculated from the 'distance' and 'duration' columns. I will also make the 'duration' data uniform:
def getSpeed(x,t):
    if isinstance(t, str):
        if len(t) == 5:
            tmp = datetime.strptime(t,"%M:%S") #string parsing method
            if tmp.minute < 2: #not minutes, but hours
                tmp = datetime.strptime(t,"%H:%M")
        else:
            tmp = datetime.strptime(t,"%H:%M:%S")
        tot=tmp.hour+tmp.minute/60 +tmp.second/3600 #hours and fractions of hours
        return round((0.6214*x)/(tot),2) #miles per hour
    else:
        return None
def makeUniform(t):
    if isinstance(t, str):
        if len(t) == 5:
            tmp = datetime.strptime(t,"%M:%S") #string parsing method
            if tmp.minute < 2: #not minutes, but hours
                tmp = datetime.strptime(t,"%H:%M")
        else:
            tmp = datetime.strptime(t,"%H:%M:%S")
        return timedelta(hours=tmp.hour, minutes=tmp.minute, seconds=tmp.second)
    else:
        return None
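A quick sanity check of the two helpers with made-up values (5 km in 30 minutes should come out to about 6.21 mph):
#made-up inputs, just to verify the parsing logic
print(getSpeed(5.0, '00:30:00'))   #expect 6.21 (mph)
print(makeUniform('32:37'))        #short format, expect 0:32:37
print(makeUniform('01:15'))        #minutes < 2 branch: interpreted as 1h 15min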
#vectorized version is about 4 times faster than a for loop
df['speed']=np.vectorize(getSpeed)(df['distance'], df['duration'])
#now make durations uniform (found doing this afterwards easier)
df['duration']=df['duration'].apply(lambda x:makeUniform(x))
#check head of the dataframe again
df.head(10)
I want to move the speed column to just after the distance column:
col_list = list(df)
col_list
columnsTitles=['created',
'time',
'sport',
'duration',
'calories',
'distance',
'speed',
'heart-rate_ave',
'heart-rate_max',
'weight',
'note']
df=df.reindex(columns=columnsTitles)
df.head(20)
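An alternative to reindexing would be to pop the column and insert it back right after 'distance'; left commented out here as a sketch, in the spirit of the commented-out conversion above:
#speed_col = df.pop('speed')
#df.insert(df.columns.get_loc('distance') + 1, 'speed', speed_col)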
Now that I have a speed column I can compare running data outside (Running) and inside (Treadmill):
sports=df.groupby('sport')
#comparing outside and inside running
run=sports.get_group('Running')
indoors=sports.get_group('Treadmill')
plt.figure(figsize=(12,10))
plt.subplot(2, 1, 1)
plt.plot(run['time'],run['distance'], label='Running')
plt.plot(indoors['time'],indoors['distance'], label='Treadmill')
plt.ylabel('Distance (km)')
plt.legend()
plt.subplot(2, 1, 2)
plt.plot(run['time'],run['speed'], label='Running')
plt.plot(indoors['time'],indoors['speed'], label='Treadmill')
plt.ylabel('Speed (mph)')
plt.legend()
plt.show()
Making a plot of calories burned per week:
#for counting calories per week
def weekCount(df,col,tcol='time'):
    week=pd.DataFrame(columns=['Date','value'])
    low=datetime(2010,4,18) #starting on the first Sunday in the database
    high=low + timedelta(days=7)
    last=df.iloc[-1,1] #time column of last row
    i=0
    while high <= last:
        value=df[(low<=df[tcol]) & (df[tcol]< high)][col].sum()
        week.loc[i]=[low,value]
        low=high
        high=low + timedelta(days=7)
        i +=1
    return week.set_index('Date')['value']
calPerWeek=weekCount(df,'calories')
fig =calPerWeek.plot.line(figsize=(10, 6))
fig.set_ylabel("Calories per week");
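For comparison, pandas can do this weekly aggregation directly with resample; a sketch (the default 'W' bins end on Sundays, so the bin edges will not line up exactly with the loop above):
#weekly calorie totals using a time-based resample
calPerWeek_alt = df.set_index('time')['calories'].resample('W').sum()
calPerWeek_alt.plot.line(figsize=(10, 6));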
Storing the dataframe to a CSV file:
df.to_csv('polarData.csv', index=False)